Texthero is a Python package to work with text data efficiently. It empowers NLP developers with a tool to quickly understand any text-based dataset, and it provides a solid pipeline to clean and represent text data, from zero to hero (Ref 1).
Processing 'description' and 'neighborhood_overview':
The transformed features are:
It also contains some k-means clustering to visualise some of the word clouds for different types of properties across London.
!pip install texthero
# visualisation and data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# show up to 200 columns when displaying wide DataFrames in the notebook
pd.set_option('display.max_columns', 200)
# system and files management
import os
import sys
import pickle
import ast
# NLP
import texthero as hero
from texthero import preprocessing
from texthero import stopwords
from IPython import display
# modules added to system path
# NOTE(review): r'/tools' is an absolute path from the filesystem root —
# confirm this is intended rather than a path relative to the notebook.
path_tools = os.path.abspath(r'/tools')
sys.path.extend([path_tools])
import tools.exploring_tools as mtools
# Load the listings dataset (includes earlier price predictions) and preview it.
df = pd.read_pickle('extended_data_set_pred_01.pick')
df.head()
# The two free-text columns this notebook will clean and represent.
text_columns = ['description', 'neighborhood_overview']
df[text_columns].head()
# Sample descriptions: note the raw HTML markup embedded in the text.
df.loc[0, 'description']
df.loc[1, 'description']
# Cleaning pipeline applied to every free-text column.
# Stemming is intentionally left out so word clouds stay human-readable.
custom_pipeline_1 = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
    # preprocessing.stem,  # deliberately disabled
]
# Cleaned version of the listing description.
df['desc_clean'] = hero.clean(df['description'], pipeline=custom_pipeline_1)
df.loc[0, 'desc_clean']
# Extend the default stopword list with HTML leftovers ("br", "b") and
# corpus-specific noise words, then strip them from the cleaned descriptions.
extra_noise = {"'", "br", "b",
               "ot", "etc", "gue",
               "london"}
custom_stopwords = stopwords.DEFAULT.union(extra_noise)
df['desc_clean'] = hero.remove_stopwords(df['desc_clean'], custom_stopwords)
# Spot-check a few cleaned descriptions.
df.loc[0, 'desc_clean']
df.loc[1, 'desc_clean']
df.loc[7000, 'desc_clean']
# Frequency of the most common words across all cleaned descriptions.
tw = hero.visualization.top_words(df['desc_clean'])
# Convert raw counts to a percentage of all word occurrences.
twp = tw / tw.sum() * 100
twp.head(15)
mtools.bar_plot_quick(twp.head(15), figsize=(10, 5), fontsize=15, vert=True)
plt.ylabel("Percentage of times word found in all the descriptions")
# Trailing None suppresses the text repr of the last call in notebook output.
None
mtools.bar_plot_quick(tw.head(15), figsize=(10, 5), fontsize=15, vert=True)
plt.ylabel("Number of times word found in all the descriptions")
None
# Word cloud of the cleaned descriptions.
tw = hero.visualization.wordcloud(df['desc_clean'])
To have a more graphical representation, Principal Component Analysis (PCA) will be used to compress the words into a vector space. But first we need to find the Term Frequency–Inverse Document Frequency (TF-IDF), a numerical statistic intended to reflect how important a word is to a document in a collection of documents — in this case, each observation.
The next step is to run a K-means clustering to add colour to the graphical representation. Texthero takes a Series as input and returns a Series as output, so I can set the output to be a new column in the dataframe.
# TF-IDF over the top 100 terms as the numeric representation of each description.
df['desc_clean_tfidf'] = (hero.tfidf(df['desc_clean'], max_features=100))
df['desc_clean_tfidf'].head()
# 2-D PCA projection of the TF-IDF vectors, used for the scatter plots below.
df['desc_clean_pca'] = (df['desc_clean_tfidf'].pipe(hero.pca))
# K-means labels for k = 3..7, stored as strings for categorical colouring.
# (Fixed a copy-paste bug: 'desc_clean_kmeans_6' was built with n_clusters=5.)
for k in range(3, 8):
    df[f'desc_clean_kmeans_{k}'] = (
        df['desc_clean_tfidf'].pipe(hero.kmeans, n_clusters=k).astype(str)
    )
df.head()
# PCA scatter of descriptions coloured by the 3-cluster assignment.
hero.scatterplot(df, 'desc_clean_pca', color='desc_clean_kmeans_3', title="3 Clusters of Airbnb Descriptions")
# Geographic view: listings plotted by coordinates, colour = cluster,
# marker style = room type.
fig, ax = plt.subplots(figsize=(25, 15))
sns.scatterplot(data=df, x="longitude",
y="latitude",
hue="desc_clean_kmeans_3",
style="room_type",
ax=ax)
# Listing counts per borough / description-cluster / room type.
borough_cluster_counts = df.groupby(
    ['neighbourhood_cleansed', 'desc_clean_kmeans_3', 'room_type']
)['desc_clean_kmeans_3'].count().to_frame('count')
fig, ax = plt.subplots(figsize=(25, 15))
borough_cluster_counts.unstack().plot(kind='bar', ax=ax)
plt.ylabel('Number of listings')
plt.xticks(rotation=90)
None
# Same counts aggregated over boroughs: cluster vs room type.
cluster_room_counts = df.groupby(
    ['desc_clean_kmeans_3', 'room_type']
)['desc_clean_kmeans_3'].count().to_frame('count')
fig, ax = plt.subplots(figsize=(10, 8))
cluster_room_counts.unstack().plot(kind='bar', ax=ax)
plt.xticks(rotation=90)
None
# Baseline comparison: borough vs room type, ignoring clusters.
fig, ax = plt.subplots(figsize=(18, 8))
borough_room_counts = df.groupby(
    ['neighbourhood_cleansed', 'room_type']
)['desc_clean_kmeans_3'].count().to_frame('count')
borough_room_counts.unstack().plot(kind='bar', ax=ax)
plt.xticks(rotation=90)
None
# Word clouds for each of the three description clusters.
tw_3_0 = hero.visualization.wordcloud(df.loc[df['desc_clean_kmeans_3'].isin(['0']), 'desc_clean'])
tw_3_1 = hero.visualization.wordcloud(df.loc[df['desc_clean_kmeans_3'].isin(['1']), 'desc_clean'])
tw_3_2 = hero.visualization.wordcloud(df.loc[df['desc_clean_kmeans_3'].isin(['2']), 'desc_clean'])
# PCA scatters of the description clusters for k = 4..7.
# (Fixed: the k=7 plot's title wrongly said "5 Clusters".)
hero.scatterplot(df, 'desc_clean_pca', color='desc_clean_kmeans_4', title="4 Clusters of Airbnb Descriptions")
hero.scatterplot(df, 'desc_clean_pca', color='desc_clean_kmeans_5', title="5 Clusters of Airbnb Descriptions")
hero.scatterplot(df, 'desc_clean_pca', color='desc_clean_kmeans_6', title="6 Clusters of Airbnb Descriptions")
hero.scatterplot(df, 'desc_clean_pca', color='desc_clean_kmeans_7', title="7 Clusters of Airbnb Descriptions")
# Map of London boroughs for reference when reading the borough plots.
display.Image("https://www.cityam.com/assets/uploads/content/2016/09/150203-london-boroughs-57edad1271160.png")
# Predicted price per borough, split by the 3-cluster description label.
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(x="neighbourhood_cleansed", y="predicted_price_01", hue='desc_clean_kmeans_3', data=df)
plt.xticks(rotation=90)
None
# Same view with the 5-cluster label.
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(x="neighbourhood_cleansed", y="predicted_price_01", hue='desc_clean_kmeans_5', data=df)
plt.xticks(rotation=90)
None
# PCA representation of description
# Split the 2-D PCA vector into separate x/y feature columns.
# NOTE(review): 'desccription' is misspelled in these column names; kept
# as-is because downstream consumers may depend on the exact names —
# confirm before renaming.
df['desccription_pca_x'] = df['desc_clean_pca'].apply(lambda x: x[0])
df['desccription_pca_y'] = df['desc_clean_pca'].apply(lambda x: x[1])
# Raw samples of the neighbourhood overview text (also contains HTML markup).
df.loc[0, 'neighborhood_overview']
df.loc[1, 'neighborhood_overview']
# Apply the same cleaning pipeline used for 'description'.
df['neigh_clean'] = hero.clean(df['neighborhood_overview'], pipeline=custom_pipeline_1)
df.loc[0, 'neigh_clean']
# HTML leftovers plus extra high-frequency but uninformative words for
# this column (e.g. "road", "street", "away").
neigh_noise = {"'", "br", "b",
               "ot", "etc", "gue",
               "london", "also", "road",
               "great", "street", "away"}
custom_stopwords = stopwords.DEFAULT.union(neigh_noise)
df['neigh_clean'] = hero.remove_stopwords(df['neigh_clean'], custom_stopwords)
# Spot-check the cleaned overviews.
df.loc[0, 'neigh_clean']
df.loc[1, 'neigh_clean']
df.loc[7000, 'neigh_clean']
# Frequency of the most common words across all cleaned overviews.
tw = hero.visualization.top_words(df['neigh_clean'])
# Convert raw counts to a percentage of all word occurrences.
twp = tw / tw.sum() * 100
twp.head(15)
mtools.bar_plot_quick(twp.head(15), figsize=(10, 5), fontsize=15, vert=True)
plt.ylabel("Percentage of times word found in all the neighborhood_overview")
# Trailing None suppresses the text repr of the last call in notebook output.
None
mtools.bar_plot_quick(tw.head(15), figsize=(10, 5), fontsize=15, vert=True)
plt.ylabel("Number of times word found in all the neighborhood_overview")
None
# Word cloud of the cleaned overviews.
tw = hero.visualization.wordcloud(df['neigh_clean'])
To have a more graphical representation, Principal Component Analysis (PCA) will be used to compress the words into a vector space. But first we need to find the Term Frequency–Inverse Document Frequency (TF-IDF), a numerical statistic intended to reflect how important a word is to a document in a collection of documents — in this case, each observation.
The next step is to run a K-means clustering to add colour to the graphical representation. Texthero takes a Series as input and returns a Series as output, so I can set the output to be a new column in the dataframe.
# TF-IDF over the top 100 terms as the numeric representation of each overview.
df['neigh_clean_tfidf'] = (hero.tfidf(df['neigh_clean'], max_features=100))
df['neigh_clean_tfidf'].head()
# 2-D PCA projection of the TF-IDF vectors, used for the scatter plots below.
df['neigh_clean_pca'] = (df['neigh_clean_tfidf'].pipe(hero.pca))
# K-means labels for k = 3..7, stored as strings for categorical colouring.
# (Fixed a copy-paste bug: 'neigh_clean_kmeans_6' was built with n_clusters=5.)
for k in range(3, 8):
    df[f'neigh_clean_kmeans_{k}'] = (
        df['neigh_clean_tfidf'].pipe(hero.kmeans, n_clusters=k).astype(str)
    )
df.head()
# PCA scatter of overviews coloured by the 3-cluster assignment.
hero.scatterplot(df, 'neigh_clean_pca', color='neigh_clean_kmeans_3', title="3 Clusters of Airbnb Neighbourhood Overview")
# Geographic view: listings plotted by coordinates, colour = cluster,
# marker style = room type.
fig, ax = plt.subplots(figsize=(25, 15))
sns.scatterplot(data=df, x="longitude",
y="latitude",
hue="neigh_clean_kmeans_3",
style="room_type",
ax=ax)
# Listing counts per borough / overview-cluster / room type.
# (Fixed: the counted column was 'desc_clean_kmeans_3' while grouping by the
# overview clusters — the count is the same, but the mismatch was misleading.)
df_clust_3_details = df.groupby(
    ['neighbourhood_cleansed', 'neigh_clean_kmeans_3', 'room_type']
)['neigh_clean_kmeans_3'].count().to_frame('count')
fig, ax = plt.subplots(figsize=(25, 15))
df_clust_3_details.unstack().plot(kind='bar', ax=ax)
plt.ylabel('Number of listings')
plt.xticks(rotation=90)
None
# Drill down on cluster '2': counts per borough and room type.
df_clust_3_details_unst = df_clust_3_details.unstack().reset_index()
df_clust_3_details_unst_c2 = df_clust_3_details_unst[
    df_clust_3_details_unst['neigh_clean_kmeans_3'].isin(['2'])
].copy()  # explicit copy so the in-place edits below don't hit a view
df_clust_3_details_unst_c2.set_index('neighbourhood_cleansed', inplace=True)
del df_clust_3_details_unst_c2['neigh_clean_kmeans_3']
fig, ax = plt.subplots(figsize=(18, 10))
df_clust_3_details_unst_c2.plot(kind='bar', ax=ax)
df_clust_3_details_unst.info()
# Counts aggregated over boroughs: overview cluster vs room type.
df_clust_3_details_03 = df.groupby(['neigh_clean_kmeans_3', 'room_type'])['neigh_clean_kmeans_3'].count().to_frame('count')
fig, ax = plt.subplots(figsize=(10, 8))
df_clust_3_details_03.unstack().plot(kind='bar', ax=ax)
plt.xticks(rotation=90)
None
# Baseline comparison: borough vs room type, ignoring clusters.
fig, ax = plt.subplots(figsize=(18, 8))
df.groupby(['neighbourhood_cleansed', 'room_type'])['neigh_clean_kmeans_3'].count().to_frame('count').unstack().plot(kind='bar', ax=ax)
plt.xticks(rotation=90)
None
# Word clouds for each of the three overview clusters.
# (Fixed: the original reused tw_3_2 for clusters 0 and 1 and named the
# cluster-2 cloud tw_3_3; variables now follow the tw_3_<cluster> scheme.)
tw_3_0 = hero.visualization.wordcloud(df.loc[df['neigh_clean_kmeans_3'].isin(['0']), 'neigh_clean'])
tw_3_1 = hero.visualization.wordcloud(df.loc[df['neigh_clean_kmeans_3'].isin(['1']), 'neigh_clean'])
tw_3_2 = hero.visualization.wordcloud(df.loc[df['neigh_clean_kmeans_3'].isin(['2']), 'neigh_clean'])
# PCA scatters of the overview clusters for k = 4..7.
# (Fixed: the k=7 plot's title wrongly said "5 Clusters"; also corrected the
# "Neighboardhood" misspelling in the displayed titles.)
hero.scatterplot(df, 'neigh_clean_pca', color='neigh_clean_kmeans_4', title="4 Clusters of Neighbourhood Overview")
hero.scatterplot(df, 'neigh_clean_pca', color='neigh_clean_kmeans_5', title="5 Clusters of Neighbourhood Overview")
hero.scatterplot(df, 'neigh_clean_pca', color='neigh_clean_kmeans_6', title="6 Clusters of Airbnb Neighbourhood Overview")
hero.scatterplot(df, 'neigh_clean_pca', color='neigh_clean_kmeans_7', title="7 Clusters of Airbnb Neighbourhood Overview")
# Map of London boroughs for reference when reading the borough plots.
display.Image("https://www.cityam.com/assets/uploads/content/2016/09/150203-london-boroughs-57edad1271160.png")
# Predicted price per borough, split by the 3-cluster overview label.
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(x="neighbourhood_cleansed", y="predicted_price_01", hue='neigh_clean_kmeans_3', data=df)
plt.xticks(rotation=90)
None
# Same view with the 5-cluster label.
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(x="neighbourhood_cleansed", y="predicted_price_01", hue='neigh_clean_kmeans_5', data=df)
plt.xticks(rotation=90)
None
# PCA representation of the neighbourhood overview (the source column is
# 'neigh_clean_pca'): split the 2-D vector into separate x/y feature columns.
df['neighborhood_overview_pca_x'] = df['neigh_clean_pca'].apply(lambda x: x[0])
df['neighborhood_overview_pca_y'] = df['neigh_clean_pca'].apply(lambda x: x[1])
# Persisting the extended dataset is currently disabled.
# df.to_pickle('extended_data_set_pred_01_pca.pick')